home *** CD-ROM | disk | FTP | other *** search
- /*
- Anagram Programmer's Challenge
-
- by Larry Landry
-
- This implementation uses a large amount of memory to
- optimize the CPU utilization. To guarantee that we
- have enough memory for all matching words, we actually
- allocate an array of pointers for 30,000 words. Since
- the rules stated that there would be about 20,000 words
- in the dictionary, even if every word matched, we would
- still have enough storage. In reality this number
- could probably be less than 1-200 for all but the most
- rare of scenarios.
-
- The basic algorithm is:
- 1) Convert the input string into a table of counts for
- each character from a-z. So "sammy" would have a
- count of 2 for "m" and 1 for each of "s", "a", and "y".
- This makes testing for the presence of a character
- as simple as checking and indexed value in an array.
- 2) Parse through the dictionary and find the words that
- can be composed of some portion of characters from
- the input characters. Build a list of pointers to
- each word. The number of words in this list will
- be in the tens instead of thousands.
- 3) Recursively process the words in this list and find
- strings of words that use up all of the characters.
- For each matching sequence, output the words to
- the file.
-
- The processing requrired by this algorithm is then
- D * C1 + M * log2(M) * C2
- where
- D = size of input dictionary
- M = number of matching words
- C1 & C2 are constants
- This algorithm works very well for cases where there are
- few words that match the input letters. The worst case
- scenario where all words can be made from the input letters
- will still take a very long time. I expect that matching
- words will typically be less than 100.
-
- */
-
- #include <stdio.h>
-
- typedef unsigned char uchar;
- typedef unsigned short ushort;
- typedef unsigned long ulong;
-
- #define MAX_WORDS 30000L
- #define OUTPUT_BUFFER_SIZE 10000L
- #define RETURN '\n'
-
- typedef struct {
- char*fWordStart;
- short fWordLength;
- } WordLoc;
-
- /* Usage counts for each character
- (only indexes 'a' to 'z' are actually used) */
-
- typedef uchar CharData[256];
-
- unsigned long Anagram(Str255 inputText, FILE *wordList,
- FILE *outputFile);
-
- ulong findInputWords(register char *wordBuffer,
- WordLoc *validWords);
- ulong findAnagrams(short numValidChars, ulong wordCount,
- WordLoc *validWords, short prevWordCount);
-
-
- /* I use some global variables here to avoid passing them
- down into the recursive routine findAnagrams(). These values
- are constant once findAnagrams() is called. */
-
- char gOutputBuffer[OUTPUT_BUFFER_SIZE];
- char *gOutputBufferEnd = gOutputBuffer +
- OUTPUT_BUFFER_SIZE - 512;
- char *gOutputPtr;
- CharData gValidChars;
- WordLoc *gWordsInUse[255];
- FILE *gOutputFile;
-
-
- unsigned long Anagram(Str255 inputText, FILE *wordList,
- FILE *outputFile)
- {
- fpos_t wordBufferLength;
- char*wordBuffer;
- short index;
- short numValidChars;
- WordLoc validWords[MAX_WORDS];
- char ch;
- ulong wordCount;
-
- gOutputFile = outputFile;
- gOutputPtr = gOutputBuffer;
-
- /* To save on file I/O time, read the whole file
- all at once. First, find the length of the file
- by seeking the end and finding the file pos.
- Then allocate a buffer of that size, plus 2 bytes
- (for a return and NULL char) and read the data
- into it. Finally put the return and NULL char
- at the end. */
-
- fseek(wordList, 0L, SEEK_END);
- fgetpos(wordList, &wordBufferLength);
- wordBuffer = (char*) NewPtr((Size) wordBufferLength + 2);
- if (wordBuffer == NULL)
- return 0L; /* real error handling here */
- rewind(wordList);
- fread(wordBuffer, (size_t) 1,
- (size_t) wordBufferLength, wordList);
- if (wordBuffer[wordBufferLength-1] != RETURN)
- wordBuffer[wordBufferLength++] = RETURN;
- wordBuffer[wordBufferLength] = '\0';
-
- /* To save time ruling out words, we build a
- list of the valid characters in the words. We start
- with no valid characters. */
- for (index='a'; index<'z'; index++)
- gValidChars[index] = 0;
-
- /* Now build the list of valid characters.
- Each array entry will be a count of how
- many times that character is present. */
-
- numValidChars = *inputText++;
- for (index=numValidChars; index>0; index--)
- if ((ch = *inputText++) != ' ')
- gValidChars[ch]++;
- else
- numValidChars--;
-
- /* Find the list of words that can be made up from the
- letters in the input word */
-
- wordCount = findInputWords(wordBuffer, &validWords[MAX_WORDS-1]);
-
- /* Now find the list of full anagrams that can be
- created from these words */
-
- wordCount = findAnagrams(numValidChars, wordCount,
- &validWords[MAX_WORDS-wordCount], 0);
-
- /* Write the results to the output */
- *gOutputPtr = 0;/* Terminate the string */
- fprintf(outputFile, gOutputBuffer);
-
- DisposPtr(wordBuffer);
-
- return wordCount;
- } /* Anagram */
-
-
- ulong findInputWords(register char *wordBuffer,
- WordLoc *validWords)
- {
- char*saveStart = wordBuffer;
- ulong numberWords = 0;
- char ch;
-
- while (*wordBuffer)
- {
- ch = *wordBuffer++;
- if (ch == RETURN)
- {
- /* Record this entry as a valid word */
- numberWords++;
- validWords->fWordStart = saveStart;
- validWords->fWordLength = (short)(wordBuffer -
- saveStart - 1);
- validWords--;
-
- wordBuffer--;
- while (saveStart < wordBuffer)
- gValidChars[*saveStart++]++;
-
- /* Save the new start of word pointer */
- saveStart = ++wordBuffer;
- } else if (gValidChars[ch])
- gValidChars[ch]--;
- else
- {
- /* This word didn't match so reset and go to the
- next word */
-
- wordBuffer--;
- while (saveStart < wordBuffer)
- gValidChars[*saveStart++]++;
- while (*wordBuffer++ != RETURN)
- ;
-
- /* Save the new start of word pointer */
- saveStart = wordBuffer;
- } /* else */
- } /* while */
-
- return numberWords;
- } /* findInputWords */
-
-
- ulong findAnagrams(short numValidChars, ulong wordCount,
- WordLoc *validWords, short prevWordCount)
- {
- ulong wordIndex;
- ulong usedIndex;
- short chIndex;
- ulong matchCount = 0;
- Boolean wordFits;
- char ch;
- char*tempPtr;
- WordLoc *theWord;
-
- /* Try each word we have against the list of characters. */
-
- for (wordIndex=0; wordIndex<wordCount; wordIndex++)
- {
- /* If there aren't enough characters left,
- it can't be a match */
-
- if (validWords->fWordLength <= numValidChars)
- {
- /* Go through the chars in this word testing to
- make sure that there is at least one of each char
- available */
- wordFits = TRUE;
- for (chIndex=0; chIndex<validWords->fWordLength; chIndex++)
- {
- ch = validWords->fWordStart[chIndex];
- if (gValidChars[ch])
- gValidChars[ch]--;
- else
- {
- /* Found an unavailable character, so this
- can't be part of the anagram. Reset the
- character usage array and go to the next
- word. */
-
- wordFits = FALSE;
- while (--chIndex >= 0)
- gValidChars[validWords->fWordStart[chIndex]]++;
- break; /* get out of the for loop */
- } /* else */
- } /* for */
-
- if (wordFits)
- {
- /* This word fit, so see if it uses all the characters.
- If so, then we have found an anagram. Output the
- anagram and increment the anagram count. */
-
- if (validWords->fWordLength == numValidChars)
- {
- matchCount++;
- /* Copy the previous words for this anagram
- separated by spaces. */
- for (usedIndex=0; usedIndex<prevWordCount; usedIndex++)
- {
- theWord = gWordsInUse[usedIndex];
- memcpy(gOutputPtr, theWord->fWordStart,
- (size_t) theWord->fWordLength);
- gOutputPtr += theWord->fWordLength;
- *gOutputPtr++ = ' ';
- } /* for */
-
- /* Now copy this new word and a return character */
- memcpy(gOutputPtr, validWords->fWordStart,
- (size_t) validWords->fWordLength);
- gOutputPtr += validWords->fWordLength;
- *gOutputPtr++ = RETURN;
-
- /* To ensure that we don't overrun the output buffer
- check against the end of the buffer. If the end
- pointer has been passed, write the data to the file
- and reset the output pointer to the beginning of
- the buffer. */
-
- if (gOutputPtr > gOutputBufferEnd)
- {
- *gOutputPtr = 0;/* Terminate the string */
- fprintf(gOutputFile, gOutputBuffer);
- gOutputPtr = gOutputBuffer;
- } /* if */
- }
- else
- {
- /* This word did fit, but didn't use all of the
- characters so add it to the list of previous words
- in the anagram and then call this procedure
- recursively to find if there are more words that
- can be added to make an anagram with this base. */
-
- gWordsInUse[prevWordCount] = validWords;
- matchCount += findAnagrams(
- numValidChars - validWords->fWordLength,
- wordCount - wordIndex, validWords,
- prevWordCount + 1);
- } /* else */
-
- /* Now undo the characters we took out of the validChar array */
- for (chIndex=0; chIndex<validWords->fWordLength; chIndex++)
- gValidChars[validWords->fWordStart[chIndex]]++;
- } /* if */
- } /* if */
-
- validWords++;
- } /* for */
-
- return matchCount;
- } /* findAnagrams */
-